// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= //
//
//  Project:   Talina Gaming System (TgS) (∂)
//  File:      TgS (W32) Common - Math API [Matrix] [M] [F] [44].c
//  Author:    Andrew Aye (EMail: mailto:andrew.aye@gmail.com, Web: http://www.andrewaye.com)
//  Version:   4.0
//
// ------------------------------------------------------------------------------------------------------------------------------ //
//
//  Copyright: © 2002-2010, Andrew Aye.  All Rights Reserved.
//
//  This software is free for non-commercial use. Redistribution and use in source and binary forms, with or without modification,
//  are permitted provided that the following conditions are met: 
//    Redistributions of source code must retain this copyright notice, this list of conditions and the following disclaimers. 
//    Redistributions in binary form must reproduce this copyright notice, this list of conditions and the following
//      disclaimers in the documentation and other materials provided with the distribution. 
//
//  Neither the names of the copyright owner nor the names of its contributors may be used to endorse or promote products derived
//  from this software without specific prior written permission. 
//
//  The intellectual property rights of the algorithms used reside with Andrew Aye.  You may not use this software, in whole or
//  in part, in support of any commercial product without the express written consent of the author.
//
//  There is no warranty or other guarantee of fitness of this software for any purpose. It is provided solely "as is".
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= //

// ---- MATRIX ARITHMETIC OPERATIONS -------------------------------------------------------------------------------------------- //

// Concatenate two 4x4 single-precision matrices: *ptmRet = (*ptmM0) x (*ptmM1).
//
//  ptmRet  Destination matrix; all four rows are overwritten.
//  ptmM0   Left-hand operand.
//  ptmM1   Right-hand operand.
//
// Result row i is  ( M0[i][0] * M1.row0 + M0[i][1] * M1.row1 ) + ( M0[i][2] * M1.row2 + M0[i][3] * M1.row3 ),
// where each scalar M0[i][k] is broadcast across a vector with _mm_shuffle_ps (0x00, 0x55, 0xAA, 0xFF select
// lane 0, 1, 2, 3 respectively).  The grouping of the additions is part of the contract: changing it would
// change the floating-point rounding of the result.
TgVOID M_CAT_F32_44( PCU_TgMAT_F32_44 ptmRet, CPCU_TgMAT_F32_44 ptmM0, CPCU_TgMAT_F32_44 ptmM1 )
{
    // Pull every input row into a local before anything is written to the destination.
    const __m128    mLhs0 = ptmM0->m_atvRow[0].m_mData;
    const __m128    mLhs1 = ptmM0->m_atvRow[1].m_mData;
    const __m128    mLhs2 = ptmM0->m_atvRow[2].m_mData;
    const __m128    mLhs3 = ptmM0->m_atvRow[3].m_mData;

    const __m128    mRhs0 = ptmM1->m_atvRow[0].m_mData;
    const __m128    mRhs1 = ptmM1->m_atvRow[1].m_mData;
    const __m128    mRhs2 = ptmM1->m_atvRow[2].m_mData;
    const __m128    mRhs3 = ptmM1->m_atvRow[3].m_mData;

    // Each output row is a linear combination of the right-hand rows, weighted by one left-hand row.
    const __m128    mOut0 = _mm_add_ps(
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs0, mLhs0, 0x00 ), mRhs0 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs0, mLhs0, 0x55 ), mRhs1 ) ),
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs0, mLhs0, 0xAA ), mRhs2 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs0, mLhs0, 0xFF ), mRhs3 ) ) );

    const __m128    mOut1 = _mm_add_ps(
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs1, mLhs1, 0x00 ), mRhs0 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs1, mLhs1, 0x55 ), mRhs1 ) ),
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs1, mLhs1, 0xAA ), mRhs2 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs1, mLhs1, 0xFF ), mRhs3 ) ) );

    const __m128    mOut2 = _mm_add_ps(
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs2, mLhs2, 0x00 ), mRhs0 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs2, mLhs2, 0x55 ), mRhs1 ) ),
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs2, mLhs2, 0xAA ), mRhs2 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs2, mLhs2, 0xFF ), mRhs3 ) ) );

    const __m128    mOut3 = _mm_add_ps(
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs3, mLhs3, 0x00 ), mRhs0 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs3, mLhs3, 0x55 ), mRhs1 ) ),
        _mm_add_ps( _mm_mul_ps( _mm_shuffle_ps( mLhs3, mLhs3, 0xAA ), mRhs2 ),
                    _mm_mul_ps( _mm_shuffle_ps( mLhs3, mLhs3, 0xFF ), mRhs3 ) ) );

    ptmRet->m_atvRow[0].m_mData = mOut0;
    ptmRet->m_atvRow[1].m_mData = mOut1;
    ptmRet->m_atvRow[2].m_mData = mOut2;
    ptmRet->m_atvRow[3].m_mData = mOut3;
}




// ---- INVERSE FUNCTIONS ------------------------------------------------------------------------------------------------------- //

// Compute the inverse of a 4x4 single-precision matrix from the matrix and a caller-supplied determinant.
//
//  ptmRet  Destination matrix; all four rows are overwritten, and only after every input has been read.
//  tvDet   Divisor applied lane-by-lane to every term (_mm_div_ps).  No zero check is made here, so a zero
//          lane yields inf/NaN in the matching result lanes.
//          NOTE(review): presumably the determinant of *ptmM1 replicated in all four lanes (the form
//          M_DET_F32_44 returns) - confirm against callers.
//  ptmM1   Matrix to invert.
//
// Structure: each output row is the sum of three terms (see the stores at the end).  Every term has the shape
//      ( element vector ) * ( A*B - C*D ) / tvDet
// i.e. a matrix element multiplied by a 2x2 sub-determinant, which is the cofactor (adjugate) formulation of
// the inverse.  The trailing comment on each shuffle lists the source element "rc" (row r, column c) expected
// in each lane, lowest lane first; these depend on the lane ordering of the project macro _MM_PERM, which is
// defined outside this file.
TgVOID M_INV_DET_F32_44( PCU_TgMAT_F32_44 ptmRet, C_TgVEC_M_F32_04 tvDet, CPCU_TgMAT_F32_44 ptmM1 )
{
    //  Construct the four corner matrices of the 4x4.  Note: this is probably an inefficient method to take the inverse of a
    // standard linear transform matrix since it can't take into account the known information of the row or column of zeros
    // that normally exists in the matrix.  However, since inverse operations should not be terribly time sensitive this may
    // not be a problem.

    // The four input rows.
    const register __m128               miR0 = ptmM1->m_atvRow[0].m_mData;
    const register __m128               miR1 = ptmM1->m_atvRow[1].m_mData;
    const register __m128               miR2 = ptmM1->m_atvRow[2].m_mData;
    const register __m128               miR3 = ptmM1->m_atvRow[3].m_mData;

    // 2x2 corner blocks of the matrix, packed one block per vector (mi03 only feeds the column extraction below).
    const register __m128               mi01 = _mm_shuffle_ps( miR0, miR1, _MM_PERM( 0, 1, 0, 1 ) ); // 00, 01, 10, 11
    const register __m128               mi02 = _mm_shuffle_ps( miR0, miR1, _MM_PERM( 2, 3, 2, 3 ) ); // 02, 03, 12, 13
    const register __m128               mi03 = _mm_shuffle_ps( miR2, miR3, _MM_PERM( 0, 1, 0, 1 ) ); // 20, 21, 30, 31
    const register __m128               mi04 = _mm_shuffle_ps( miR2, miR3, _MM_PERM( 2, 3, 2, 3 ) ); // 22, 23, 32, 33

    // Mixed-row helper vectors: low half of one row paired with the high half of its neighbour.
    const register __m128               mi05 = _mm_shuffle_ps( miR0, miR1, _MM_PERM( 0, 1, 2, 3 ) ); // 00, 01, 12, 13
    const register __m128               mi06 = _mm_shuffle_ps( miR1, miR0, _MM_PERM( 0, 1, 2, 3 ) ); // 10, 11, 02, 03
    const register __m128               mi07 = _mm_shuffle_ps( miR2, miR3, _MM_PERM( 0, 1, 2, 3 ) ); // 20, 21, 32, 33
    const register __m128               mi08 = _mm_shuffle_ps( miR3, miR2, _MM_PERM( 0, 1, 2, 3 ) ); // 30, 31, 22, 23

    // Columns 0 and 1 of the input, gathered from the corner blocks above.
    const register __m128               miC0 = _mm_shuffle_ps( mi01, mi03, _MM_PERM( 0, 2, 0, 2 ) ); // 00, 10, 20, 30
    const register __m128               miC1 = _mm_shuffle_ps( mi01, mi03, _MM_PERM( 1, 3, 1, 3 ) ); // 01, 11, 21, 31

    // ---- Output row 0: three terms (mi020, mi030, mi040), each a column-1 element times a 2x2 determinant taken
    // from columns 2 and 3.
    const register __m128               mi011 = _mm_shuffle_ps( miC1, miC1, _MM_PERM( 1, 0, 0, 0 ) ); // 11 01 01 01
    const register __m128               mi013 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 3, 2, 3, 0 ) ); // 33 32 33 22
    const register __m128               mi015 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 2, 3, 2, 1 ) ); // 32 33 32 23
    const register __m128               mi012 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 2, 3, 2, 3 ) ); // 22 23 12 13
    const register __m128               mi014 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 3, 2, 3, 2 ) ); // 23 22 13 12
    const register __m128               mi016 = _mm_mul_ps( mi012, mi013 );
    const register __m128               mi017 = _mm_mul_ps( mi014, mi015 );
    const register __m128               mi018 = _mm_sub_ps( mi016, mi017 );
    const register __m128               mi019 = _mm_mul_ps( mi011, mi018 );
    const register __m128               mi020 = _mm_div_ps( mi019, tvDet );
    const register __m128               mi021 = _mm_shuffle_ps( miC1, miC1, _MM_PERM( 2, 2, 1, 1 ) ); // 21 21 11 11
    const register __m128               mi023 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 2, 3, 2, 1 ) ); // 32 33 32 23
    const register __m128               mi025 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 3, 2, 3, 0 ) ); // 33 32 33 22
    const register __m128               mi022 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 3, 0, 3, 2 ) ); // 13 02 03 02
    const register __m128               mi024 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 2, 1, 2, 3 ) ); // 12 03 02 03
    const register __m128               mi026 = _mm_mul_ps( mi022, mi023 );
    const register __m128               mi027 = _mm_mul_ps( mi024, mi025 );
    const register __m128               mi028 = _mm_sub_ps( mi026, mi027 );
    const register __m128               mi029 = _mm_mul_ps( mi021, mi028 );
    const register __m128               mi030 = _mm_div_ps( mi029, tvDet );
    const register __m128               mi031 = _mm_shuffle_ps( miC1, miC1, _MM_PERM( 3, 3, 3, 2 ) ); // 31 31 31 21
    const register __m128               mi032 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 2, 1, 2, 3 ) ); // 12 03 02 03
    const register __m128               mi034 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 3, 0, 3, 2 ) ); // 13 02 03 02
    const register __m128               mi033 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 3, 2, 3, 2 ) ); // 23 22 13 12
    const register __m128               mi035 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 2, 3, 2, 3 ) ); // 22 23 12 13
    const register __m128               mi036 = _mm_mul_ps( mi032, mi033 );
    const register __m128               mi037 = _mm_mul_ps( mi034, mi035 );
    const register __m128               mi038 = _mm_sub_ps( mi036, mi037 );
    const register __m128               mi039 = _mm_mul_ps( mi031, mi038 );
    const register __m128               mi040 = _mm_div_ps( mi039, tvDet );

    // ---- Output row 1: three terms (mi050, mi060, mi070), each a column-0 element times a 2x2 determinant taken
    // from columns 2 and 3.  The operand order of the subtraction is swapped relative to row 0, which carries the
    // alternating cofactor sign.
    const register __m128               mi041 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 1, 0, 0, 0 ) ); // 10 00 00 00
    const register __m128               mi043 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 2, 3, 2, 1 ) ); // 32 33 32 23
    const register __m128               mi045 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 3, 2, 3, 0 ) ); // 33 32 33 22
    const register __m128               mi042 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 3, 2, 3, 2 ) ); // 23 22 13 12
    const register __m128               mi044 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 2, 3, 2, 3 ) ); // 22 23 12 13
    const register __m128               mi046 = _mm_mul_ps( mi042, mi043 );
    const register __m128               mi047 = _mm_mul_ps( mi044, mi045 );
    const register __m128               mi048 = _mm_sub_ps( mi046, mi047 );
    const register __m128               mi049 = _mm_mul_ps( mi041, mi048 );
    const register __m128               mi050 = _mm_div_ps( mi049, tvDet );
    const register __m128               mi051 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 2, 2, 1, 1 ) ); // 20 20 10 10
    const register __m128               mi053 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 3, 2, 3, 0 ) ); // 33 32 33 22
    const register __m128               mi055 = _mm_shuffle_ps( miR3, mi04, _MM_PERM( 2, 3, 2, 1 ) ); // 32 33 32 23
    const register __m128               mi052 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 2, 1, 2, 3 ) ); // 12 03 02 03
    const register __m128               mi054 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 3, 0, 3, 2 ) ); // 13 02 03 02
    const register __m128               mi056 = _mm_mul_ps( mi052, mi053 );
    const register __m128               mi057 = _mm_mul_ps( mi054, mi055 );
    const register __m128               mi058 = _mm_sub_ps( mi056, mi057 );
    const register __m128               mi059 = _mm_mul_ps( mi051, mi058 );
    const register __m128               mi060 = _mm_div_ps( mi059, tvDet );
    const register __m128               mi061 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 3, 3, 3, 2 ) ); // 30 30 30 20
    const register __m128               mi063 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 2, 3, 2, 3 ) ); // 22 23 12 13
    const register __m128               mi065 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 3, 2, 3, 2 ) ); // 23 22 13 12
    const register __m128               mi062 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 3, 0, 3, 2 ) ); // 13 02 03 02
    const register __m128               mi064 = _mm_shuffle_ps( mi02, miR0, _MM_PERM( 2, 1, 2, 3 ) ); // 12 03 02 03
    const register __m128               mi066 = _mm_mul_ps( mi062, mi063 );
    const register __m128               mi067 = _mm_mul_ps( mi064, mi065 );
    const register __m128               mi068 = _mm_sub_ps( mi066, mi067 );
    const register __m128               mi069 = _mm_mul_ps( mi061, mi068 );
    const register __m128               mi070 = _mm_div_ps( mi069, tvDet );

    // ---- Output row 2: three terms (mi080, mi090, mi100), each a column-0 element times a 2x2 determinant taken
    // from columns 1 and 3.
    const register __m128               mi071 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 1, 0, 0, 0 ) ); // 10 00 00 00
    const register __m128               mi073 = _mm_shuffle_ps( miR3, mi07, _MM_PERM( 3, 1, 3, 1 ) ); // 33 31 33 21
    const register __m128               mi075 = _mm_shuffle_ps( miR3, mi08, _MM_PERM( 1, 3, 1, 3 ) ); // 31 33 31 23
    const register __m128               mi072 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 1, 3, 1, 3 ) ); // 21 23 11 13
    const register __m128               mi074 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 3, 1, 3, 1 ) ); // 23 21 13 11
    const register __m128               mi076 = _mm_mul_ps( mi072, mi073 );
    const register __m128               mi077 = _mm_mul_ps( mi074, mi075 );
    const register __m128               mi078 = _mm_sub_ps( mi076, mi077 );
    const register __m128               mi079 = _mm_mul_ps( mi071, mi078 );
    const register __m128               mi080 = _mm_div_ps( mi079, tvDet );
    const register __m128               mi081 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 2, 2, 1, 1 ) ); // 20 20 10 10
    const register __m128               mi083 = _mm_shuffle_ps( miR3, mi08, _MM_PERM( 1, 3, 1, 3 ) ); // 31 33 31 23
    const register __m128               mi085 = _mm_shuffle_ps( miR3, mi07, _MM_PERM( 3, 1, 3, 1 ) ); // 33 31 33 21
    const register __m128               mi082 = _mm_shuffle_ps( mi05, miR0, _MM_PERM( 3, 1, 3, 1 ) ); // 13 01 03 01
    const register __m128               mi084 = _mm_shuffle_ps( mi06, miR0, _MM_PERM( 1, 3, 1, 3 ) ); // 11 03 01 03
    const register __m128               mi086 = _mm_mul_ps( mi082, mi083 );
    const register __m128               mi087 = _mm_mul_ps( mi084, mi085 );
    const register __m128               mi088 = _mm_sub_ps( mi086, mi087 );
    const register __m128               mi089 = _mm_mul_ps( mi081, mi088 );
    const register __m128               mi090 = _mm_div_ps( mi089, tvDet );
    const register __m128               mi091 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 3, 3, 3, 2 ) ); // 30 30 30 20
    const register __m128               mi093 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 3, 1, 3, 1 ) ); // 23 21 13 11
    const register __m128               mi095 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 1, 3, 1, 3 ) ); // 21 23 11 13
    const register __m128               mi092 = _mm_shuffle_ps( mi06, miR0, _MM_PERM( 1, 3, 1, 3 ) ); // 11 03 01 03
    const register __m128               mi094 = _mm_shuffle_ps( mi05, miR0, _MM_PERM( 3, 1, 3, 1 ) ); // 13 01 03 01
    const register __m128               mi096 = _mm_mul_ps( mi092, mi093 );
    const register __m128               mi097 = _mm_mul_ps( mi094, mi095 );
    const register __m128               mi098 = _mm_sub_ps( mi096, mi097 );
    const register __m128               mi099 = _mm_mul_ps( mi091, mi098 );
    const register __m128               mi100 = _mm_div_ps( mi099, tvDet );

    // ---- Output row 3: three terms (mi110, mi120, mi130), each a column-0 element times a 2x2 determinant taken
    // from columns 1 and 2.
    const register __m128               mi101 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 1, 0, 0, 0 ) ); // 10 00 00 00
    const register __m128               mi103 = _mm_shuffle_ps( miR3, mi08, _MM_PERM( 1, 2, 1, 2 ) ); // 31 32 31 22
    const register __m128               mi105 = _mm_shuffle_ps( miR3, mi07, _MM_PERM( 2, 1, 2, 1 ) ); // 32 31 32 21
    const register __m128               mi102 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 2, 1, 2, 1 ) ); // 22 21 12 11
    const register __m128               mi104 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 1, 2, 1, 2 ) ); // 21 22 11 12
    const register __m128               mi106 = _mm_mul_ps( mi102, mi103 );
    const register __m128               mi107 = _mm_mul_ps( mi104, mi105 );
    const register __m128               mi108 = _mm_sub_ps( mi106, mi107 );
    const register __m128               mi109 = _mm_mul_ps( mi101, mi108 );
    const register __m128               mi110 = _mm_div_ps( mi109, tvDet );
    const register __m128               mi111 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 2, 2, 1, 1 ) ); // 20 20 10 10
    const register __m128               mi113 = _mm_shuffle_ps( miR3, mi07, _MM_PERM( 2, 1, 2, 1 ) ); // 32 31 32 21
    const register __m128               mi115 = _mm_shuffle_ps( miR3, mi08, _MM_PERM( 1, 2, 1, 2 ) ); // 31 32 31 22
    const register __m128               mi112 = _mm_shuffle_ps( mi06, miR0, _MM_PERM( 1, 2, 1, 2 ) ); // 11 02 01 02
    const register __m128               mi114 = _mm_shuffle_ps( mi05, miR0, _MM_PERM( 2, 1, 2, 1 ) ); // 12 01 02 01
    const register __m128               mi116 = _mm_mul_ps( mi112, mi113 );
    const register __m128               mi117 = _mm_mul_ps( mi114, mi115 );
    const register __m128               mi118 = _mm_sub_ps( mi116, mi117 );
    const register __m128               mi119 = _mm_mul_ps( mi111, mi118 );
    const register __m128               mi120 = _mm_div_ps( mi119, tvDet );
    const register __m128               mi121 = _mm_shuffle_ps( miC0, miC0, _MM_PERM( 3, 3, 3, 2 ) ); // 30 30 30 20
    const register __m128               mi123 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 1, 2, 1, 2 ) ); // 21 22 11 12
    const register __m128               mi125 = _mm_shuffle_ps( miR2, miR1, _MM_PERM( 2, 1, 2, 1 ) ); // 22 21 12 11
    const register __m128               mi122 = _mm_shuffle_ps( mi05, miR0, _MM_PERM( 2, 1, 2, 1 ) ); // 12 01 02 01
    const register __m128               mi124 = _mm_shuffle_ps( mi06, miR0, _MM_PERM( 1, 2, 1, 2 ) ); // 11 02 01 02
    const register __m128               mi126 = _mm_mul_ps( mi122, mi123 );
    const register __m128               mi127 = _mm_mul_ps( mi124, mi125 );
    const register __m128               mi128 = _mm_sub_ps( mi126, mi127 );
    const register __m128               mi129 = _mm_mul_ps( mi121, mi128 );
    const register __m128               mi130 = _mm_div_ps( mi129, tvDet );

    // Sum the three already-divided terms of each row and store.  These are the only writes to *ptmRet.
    ptmRet->m_atvRow[0].m_mData = _mm_add_ps( mi020, _mm_add_ps( mi030, mi040 ) );
    ptmRet->m_atvRow[1].m_mData = _mm_add_ps( mi050, _mm_add_ps( mi060, mi070 ) );
    ptmRet->m_atvRow[2].m_mData = _mm_add_ps( mi080, _mm_add_ps( mi090, mi100 ) );
    ptmRet->m_atvRow[3].m_mData = _mm_add_ps( mi110, _mm_add_ps( mi120, mi130 ) );
};


// Compute the determinant of a 4x4 single-precision matrix.
//
//  ptmM1   Matrix whose determinant is wanted; it is only read.
//  Returns a vector with lane 0 of the final sum replicated into all four lanes.
//
// Method (assuming _MM_PERM lists lane selectors lowest lane first - TODO confirm against its definition, which
// lives outside this file): cofactor expansion along row 0.  For each column c of row 0, lanes 0..2 of
// "mMinorC" hold the three products whose sum is the 3x3 minor obtained by deleting row 0 and column c; lane 3
// is built from identical factors on both sides of the subtraction and is not used by the final reduction.
// The minors are scaled by the broadcast row-0 element, combined with alternating sign, and lanes 0, 1 and 2
// are then added together.
TgVEC_M_F32_04 M_DET_F32_44( CPCU_TgMAT_F32_44 ptmM1 )
{
    const __m128    mRow0 = ptmM1->m_atvRow[0].m_mData;
    const __m128    mRow1 = ptmM1->m_atvRow[1].m_mData;
    const __m128    mRow2 = ptmM1->m_atvRow[2].m_mData;
    const __m128    mRow3 = ptmM1->m_atvRow[3].m_mData;

    // ---- Column 0 term.
    const __m128    mFwd0 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 2, 3, 1, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 3, 1, 2, 3 ) ) );
    const __m128    mRev0 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 3, 1, 2, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 2, 3, 1, 3 ) ) );
    const __m128    mMinor0 = _mm_mul_ps( _mm_shuffle_ps( mRow1, mRow1, _MM_PERM( 1, 2, 3, 3 ) ),
                                          _mm_sub_ps( mFwd0, mRev0 ) );
    const __m128    mTerm0 = _mm_mul_ps( _mm_shuffle_ps( mRow0, mRow0, _MM_PERM( 0, 0, 0, 0 ) ), mMinor0 );

    // ---- Column 1 term.
    const __m128    mFwd1 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 2, 3, 0, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 3, 0, 2, 3 ) ) );
    const __m128    mRev1 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 3, 0, 2, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 2, 3, 0, 3 ) ) );
    const __m128    mMinor1 = _mm_mul_ps( _mm_shuffle_ps( mRow1, mRow1, _MM_PERM( 0, 2, 3, 3 ) ),
                                          _mm_sub_ps( mFwd1, mRev1 ) );
    const __m128    mTerm1 = _mm_mul_ps( _mm_shuffle_ps( mRow0, mRow0, _MM_PERM( 1, 1, 1, 1 ) ), mMinor1 );

    // ---- Column 2 term.
    const __m128    mFwd2 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 1, 3, 0, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 3, 0, 1, 3 ) ) );
    const __m128    mRev2 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 3, 0, 1, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 1, 3, 0, 3 ) ) );
    const __m128    mMinor2 = _mm_mul_ps( _mm_shuffle_ps( mRow1, mRow1, _MM_PERM( 0, 1, 3, 3 ) ),
                                          _mm_sub_ps( mFwd2, mRev2 ) );
    const __m128    mTerm2 = _mm_mul_ps( _mm_shuffle_ps( mRow0, mRow0, _MM_PERM( 2, 2, 2, 2 ) ), mMinor2 );

    // ---- Column 3 term.
    const __m128    mFwd3 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 1, 2, 0, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 2, 0, 1, 3 ) ) );
    const __m128    mRev3 = _mm_mul_ps( _mm_shuffle_ps( mRow2, mRow2, _MM_PERM( 2, 0, 1, 3 ) ),
                                        _mm_shuffle_ps( mRow3, mRow3, _MM_PERM( 1, 2, 0, 3 ) ) );
    const __m128    mMinor3 = _mm_mul_ps( _mm_shuffle_ps( mRow1, mRow1, _MM_PERM( 0, 1, 2, 3 ) ),
                                          _mm_sub_ps( mFwd3, mRev3 ) );
    const __m128    mTerm3 = _mm_mul_ps( _mm_shuffle_ps( mRow0, mRow0, _MM_PERM( 3, 3, 3, 3 ) ), mMinor3 );

    // Alternating-sign combination: ( term0 - term1 ) + ( term2 - term3 ).  Keep this grouping; it fixes the rounding.
    const __m128    mSigned = _mm_add_ps( _mm_sub_ps( mTerm0, mTerm1 ), _mm_sub_ps( mTerm2, mTerm3 ) );

    // Horizontal add of lanes 0, 1 and 2 into lane 0 (scalar adds; the upper lanes are irrelevant from here on).
    const __m128    mLane1 = _mm_shuffle_ps( mSigned, mSigned, _MM_PERM( 1, 1, 1, 1 ) );
    const __m128    mLane2 = _mm_shuffle_ps( mSigned, mSigned, _MM_PERM( 2, 2, 2, 2 ) );
    const __m128    mTotal = _mm_add_ss( _mm_add_ss( mSigned, mLane1 ), mLane2 );

    // Replicate lane 0 into every lane of the returned vector.
    return _mm_shuffle_ps( mTotal, mTotal, 0x00 );
}